# basic imports
import pandas as pd
import numpy as np
from collections import Counter
# plots
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# settings imports
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 40
import plotly.io as pio
# notebook == HTML, iframe == Jupyter
#pio.renderers.default = "iframe"
pio.renderers.default = "notebook"
%config Completer.use_jedi = False
# Imbalanced-data imports
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# ML imports
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
import sklearn.linear_model as sklin
import sklearn.metrics as met
# Load the dataset from the local CSV file and preview its first rows.
df = pd.read_csv('Data/heart_2020_cleaned.csv')
df.head()
| HeartDisease | BMI | Smoking | AlcoholDrinking | Stroke | PhysicalHealth | MentalHealth | DiffWalking | Sex | AgeCategory | Race | Diabetic | PhysicalActivity | GenHealth | SleepTime | Asthma | KidneyDisease | SkinCancer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | No | 16.60 | Yes | No | No | 3.0 | 30.0 | No | Female | 55-59 | White | Yes | Yes | Very good | 5.0 | Yes | No | Yes |
| 1 | No | 20.34 | No | No | Yes | 0.0 | 0.0 | No | Female | 80 or older | White | No | Yes | Very good | 7.0 | No | No | No |
| 2 | No | 26.58 | Yes | No | No | 20.0 | 30.0 | No | Male | 65-69 | White | Yes | Yes | Fair | 8.0 | Yes | No | No |
| 3 | No | 24.21 | No | No | No | 0.0 | 0.0 | No | Female | 75-79 | White | No | No | Good | 6.0 | No | No | Yes |
| 4 | No | 23.71 | No | No | No | 28.0 | 0.0 | Yes | Female | 40-44 | White | No | Yes | Very good | 8.0 | No | No | No |
# Total number of missing values over every column of the frame.
df.isna().sum().sum()
0
# Number of distinct values in each column.
df.nunique()
HeartDisease 2 BMI 3604 Smoking 2 AlcoholDrinking 2 Stroke 2 PhysicalHealth 31 MentalHealth 31 DiffWalking 2 Sex 2 AgeCategory 13 Race 6 Diabetic 4 PhysicalActivity 2 GenHealth 5 SleepTime 24 Asthma 2 KidneyDisease 2 SkinCancer 2 dtype: int64
# Encode the two-valued text answers as integers. The mapping is applied to
# every column of df, so any cell equal to one of these keys is replaced.
# The two extra 'Diabetic' answers are folded into the plain No/Yes codes;
# they are mapped straight to ints (not the strings '0'/'1') so the column
# never holds a mix of strings and integers.
binary_codes = {
    'Yes': 1,
    'No': 0,
    'Male': 1,
    'Female': 0,
    'No, borderline diabetes': 0,
    'Yes (during pregnancy)': 1,
}
df = df.replace(binary_codes)
# Guarantee an integer dtype for 'Diabetic' whatever dtype replace() left it in.
df['Diabetic'] = df['Diabetic'].astype(int)
I chose to add basic visualisations to show the problem before building the model.
I'll need to use dummy variables for the categorical variables,
and other techniques to deal with issues such as the large class imbalance, which would otherwise ruin my model.
# Histogram of the reported sleep time, split into 24 bins.
ax = sns.histplot(data=df, x='SleepTime', bins=24)
# Title typo fixed ("Histribution" -> "Distribution").
_ = ax.set_title("Distribution of Sleep Time With 24 bins")
There are 551 people who sleep 1 hour per day!
And 30 people who sleep 24 hours per day! (I think they are dead :/)
# Overlay the BMI density of people with and without heart disease.
fig, ax = plt.subplots(figsize=(13, 5))
has_heart_disease = df["HeartDisease"] == 1
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df.loc[has_heart_disease, "BMI"], alpha=0.5, fill=True,
            color="red", label="HeartDisease", ax=ax)
sns.kdeplot(df.loc[~has_heart_disease, "BMI"], alpha=0.5, fill=True,
            color="#fccc79", label="Normal", ax=ax)
ax.set_title('Distribution of Body Mass Index', fontsize=18)
ax.set_xlabel("BodyMass")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()
def create_count_plot(x_size, y_size, df, column_name):
    """
    Draw a count plot of one column, bars ordered from most to least frequent.

    x_size, y_size - width and height passed to plt.figure(figsize=...)
    df             - DataFrame that holds the column
    column_name    - name of the column to count; also used as the plot title

    Each bar is annotated with its absolute count. Nothing is returned;
    the plot is drawn on a newly created matplotlib figure.
    """
    # Count once so the bar order and the bar labels come from the same data.
    counts = df[column_name].value_counts(ascending=False)
    plt.figure(figsize=(x_size, y_size))
    plt.title(column_name, fontsize=15)
    ax = sns.countplot(x=df[column_name], order=counts.index)
    # containers[0] holds the bars drawn by countplot, in `order`.
    ax.bar_label(container=ax.containers[0], labels=counts.values)
create_count_plot(x_size=14, y_size=6, df=df, column_name="AgeCategory")

# Correlate the numeric columns only: df still contains text columns, and
# DataFrame.corr() raises on those in pandas >= 2.0. The matrix is computed
# once and shared by both plots below.
corr_matrix = df.select_dtypes(include='number').corr()

fig = px.imshow(corr_matrix, color_continuous_scale="Blues")
fig.update_layout(height=600)
fig.show()

plt.figure(figsize=(13, 6))
plt.title('Distribution of correlation of features')
# Absolute correlation of every feature with the target. After the ascending
# sort the last entry is the target's correlation with itself, so drop it.
corr_matrix['HeartDisease'].abs().sort_values().iloc[:-1].plot.barh()
plt.show()
By the way, it's very interesting to find out that Sleep Time doesn't appear to affect Heart Disease.
# Keep only the selected features; the target column stays last.
final_columns = ["Smoking", "Stroke", "PhysicalHealth", "DiffWalking", "Sex",
                 "AgeCategory", "Diabetic", "PhysicalActivity", "GenHealth",
                 "HeartDisease"]
df = df[final_columns]

# One count plot per feature (everything except the target), split by target.
plt.figure(figsize=(30, 30))
for position, feature in enumerate(final_columns[:-1], start=1):
    plt.subplot(6, 3, position)
    # The column must be passed as x=...: a positional first argument is
    # interpreted as `data` by current seaborn and clashes with data=df.
    sns.countplot(x=feature, hue='HeartDisease', data=df)
    plt.xticks(rotation=15)
# One-hot encode the listed columns. drop_first=True drops the first level of
# each column, so a two-valued column turns into a single indicator column
# (the target 'HeartDisease' becomes 'HeartDisease_1').
lst_of_dummy_variables = ['HeartDisease','Smoking','Stroke','DiffWalking','Sex','AgeCategory','Diabetic','PhysicalActivity','GenHealth']
df = pd.get_dummies(data=df,columns=lst_of_dummy_variables,drop_first=True)
df.head(1)
| PhysicalHealth | HeartDisease_1 | Smoking_1 | Stroke_1 | DiffWalking_1 | Sex_1 | AgeCategory_25-29 | AgeCategory_30-34 | AgeCategory_35-39 | AgeCategory_40-44 | AgeCategory_45-49 | AgeCategory_50-54 | AgeCategory_55-59 | AgeCategory_60-64 | AgeCategory_65-69 | AgeCategory_70-74 | AgeCategory_75-79 | AgeCategory_80 or older | Diabetic_1 | PhysicalActivity_1 | GenHealth_Fair | GenHealth_Good | GenHealth_Poor | GenHealth_Very good | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
# Share of each target class, shown as a pie chart with percentage labels.
ax= df["HeartDisease_1"].value_counts().plot.pie(autopct='%.2f')
_= ax.set_title("Pie of Heart Disease")
There are about 10x more people without Heart Disease than people with Heart Disease:
we need to handle the dummy variables and the imbalance in the y target (the 'HeartDisease' column).
There are two methods: under-sampling and over-sampling.
I will use under-sampling because I have a lot of data (27373 rows with a "Yes" target),
and because of that I prefer to reduce the number of people without Heart Disease to this number
rather than use over-sampling to create more samples of people with Heart Disease.
If I had only a small number of people with Heart Disease, I would use over-sampling.
# Fixed seed so the random under-sampling, and therefore every score computed
# from these sets, is reproducible from run to run.
rus = RandomUnderSampler(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('HeartDisease_1', axis=1),
    df['HeartDisease_1'],
    test_size=0.33,
    random_state=42,
)
# Balance the classes by randomly dropping rows of the majority class.
X_rus_train, y_rus_train = rus.fit_resample(X_train, y_train)
# NOTE(review): the test split is balanced as well, so the scores below are
# measured on a 1:1 class ratio rather than on the original class ratio.
X_rus_test, y_rus_test = rus.fit_resample(X_test, y_test)
Counter(y_rus_train)
Counter({0: 18225, 1: 18225})
# Class counts of the resampled test target.
Counter(y_rus_test)
Counter({0: 9148, 1: 9148})
# Rebind df to the resampled training data: features and target side by side.
df = pd.concat([X_rus_train,y_rus_train],axis=1)
df.head(1)
| PhysicalHealth | Smoking_1 | Stroke_1 | DiffWalking_1 | Sex_1 | AgeCategory_25-29 | AgeCategory_30-34 | AgeCategory_35-39 | AgeCategory_40-44 | AgeCategory_45-49 | AgeCategory_50-54 | AgeCategory_55-59 | AgeCategory_60-64 | AgeCategory_65-69 | AgeCategory_70-74 | AgeCategory_75-79 | AgeCategory_80 or older | Diabetic_1 | PhysicalActivity_1 | GenHealth_Fair | GenHealth_Good | GenHealth_Poor | GenHealth_Very good | HeartDisease_1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
def evaluate_model(model, x_test, y_test):
    """
    Score an already fitted classifier on a test set.

    model  - fitted estimator; must provide predict() and predict_proba()
    x_test - test features
    y_test - true test labels

    Returns a dict with these keys:
    - 'acc'  : accuracy score
    - 'prec' : precision score
    - 'rec'  : recall score
    - 'f1'   : f1 score
    - 'fpr'  : false positive rates of the ROC curve (array)
    - 'tpr'  : true positive rates of the ROC curve (array)
    - 'auc'  : area under the ROC curve
    - 'cm'   : confusion matrix
    """
    # Predict test data
    y_pred = model.predict(x_test)

    # Calculate basic scores from the hard predictions
    acc = met.accuracy_score(y_test, y_pred)
    prec = met.precision_score(y_test, y_pred)
    rec = met.recall_score(y_test, y_pred)
    f1 = met.f1_score(y_test, y_pred)

    # ROC curve and area under curve (AUC) need scores, not hard labels:
    # use the second column of predict_proba (presumably the probability of
    # label 1 for a 0/1 target).
    y_pred_proba = model.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = met.roc_curve(y_test, y_pred_proba)
    auc = met.roc_auc_score(y_test, y_pred_proba)

    # Confusion matrix
    cm = met.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1,
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm}
def print_model_result(model_dict):
    """
    Print the numerical results stored in a model result dict.

    model_dict - dict with the keys 'acc', 'prec', 'rec', 'f1', 'auc' and
                 'cm' (a missing key raises KeyError)

    One line is printed per score; the confusion matrix is printed last,
    starting on its own line.
    """
    score_lines = (
        ('Accuracy:', 'acc'),
        ('Precision:', 'prec'),
        ('Recall:', 'rec'),
        ('F1 Score:', 'f1'),
        ('Area Under Curve:', 'auc'),
    )
    for caption, key in score_lines:
        print(caption, model_dict[key])
    print('Confusion Matrix:\n', model_dict['cm'])
def draw_cm(title, cm):
    """
    Draw a heatmap of a confusion matrix.

    title - str, model name inserted into the plot title
    cm    - confusion matrix (as returned by sklearn.metrics.confusion_matrix)

    Every cell is shown as its share of the sum of all cells, formatted as a
    percentage. The tick labels assume a 2x2 matrix ordered
    "No HeartDisease", "HeartDisease". Nothing is returned.
    """
    class_names = ['No HeartDisease', 'HeartDisease']
    plt.figure(figsize=(7, 5))
    # Normalise by the grand total so the annotations are fractions of all samples.
    ax = sns.heatmap(cm / np.sum(cm), fmt='.2%', annot=True, cmap='Blues')
    ax.set_title(f"Confusion Matrix of {title} Model")
    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)
    plt.show()
# Fit a logistic regression with default settings on the balanced training set.
model=sklin.LogisticRegression()
model.fit(X_rus_train, y_rus_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Score the fitted logistic regression on the balanced test set and print the results.
logistic_regression_result = evaluate_model(model, X_rus_test, y_rus_test)
print_model_result(logistic_regression_result)
Accuracy: 0.7608220376038478 Precision: 0.7523799449968267 Recall: 0.7775470048097944 F1 Score: 0.7647564777980862 Area Under Curve: 0.8378536425919472 Confusion Matrix: [[6807 2341] [2035 7113]]
# Confusion-matrix heatmap of the logistic regression results.
draw_cm("Logistic Regression",logistic_regression_result["cm"])
# Fit a decision tree with default settings; `model` is rebound to the new estimator.
model=tree.DecisionTreeClassifier()
model.fit(X_rus_train, y_rus_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
# Score the fitted decision tree on the balanced test set and print the results.
dt_result = evaluate_model(model, X_rus_test, y_rus_test)
print_model_result(dt_result)
Accuracy: 0.725295146480105 Precision: 0.7310538116591928 Recall: 0.7128334062090075 F1 Score: 0.7218286473323002 Area Under Curve: 0.7654030958427598 Confusion Matrix: [[6749 2399] [2627 6521]]
# Confusion-matrix heatmap of the decision tree results.
draw_cm("DecisionTreeClassifier",dt_result["cm"])
# Fit a k-nearest-neighbours classifier with default settings; `model` is rebound again.
model=KNeighborsClassifier()
model.fit(X_rus_train, y_rus_train)
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier()
# Score the fitted k-nearest-neighbours model on the balanced test set and print the results.
knn_result = evaluate_model(model, X_rus_test, y_rus_test)
print_model_result(knn_result)
Accuracy: 0.6364779186707477 Precision: 0.8044379419653743 Recall: 0.36062527328377786 F1 Score: 0.49799984904521094 Area Under Curve: 0.7066569000676626 Confusion Matrix: [[8346 802] [5849 3299]]
# Confusion-matrix heatmap of the k-nearest-neighbours results.
draw_cm("KNeighborsClassifier",knn_result["cm"])
# Initialize figure with two plots
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('Model Comparison', fontsize=16, fontweight='bold')
fig.set_figheight(10)
fig.set_figwidth(20)
fig.set_facecolor('white')

# First plot: one group of bars per metric, one bar per model
barWidth = 0.2
labels = ['Accuracy', 'Precision', 'Recall', 'F1']
metric_keys = ['acc', 'prec', 'rec', 'f1']  # same order as `labels`
logistic_regression_result_score = [logistic_regression_result[key] for key in metric_keys]
dt_result_score = [dt_result[key] for key in metric_keys]
knn_result_score = [knn_result[key] for key in metric_keys]

## Set position of bar on X axis
r1 = np.arange(len(logistic_regression_result_score))
r2 = [x + barWidth for x in r1]
r3 = [x + 2*barWidth for x in r1]

## Make the plot
ax1.bar(r1, logistic_regression_result_score, width=barWidth, edgecolor='white', label='Logistic Regression')
ax1.bar(r2, dt_result_score, width=barWidth, edgecolor='white', label='Decision Tree')
ax1.bar(r3, knn_result_score, width=barWidth, edgecolor='white', label='K-Nearest Neighbors')

## Configure x and y axis
ax1.set_xlabel('Metrics', fontweight='bold')
# The three bars of a group are centred at r, r + barWidth and r + 2*barWidth,
# so the middle of the group (where the tick belongs) is r + barWidth.
ax1.set_xticks([r + barWidth for r in range(len(logistic_regression_result_score))])
ax1.set_xticklabels(labels)
ax1.set_ylabel('Score', fontweight='bold')
ax1.set_ylim(0, 1)

## Create legend & title
ax1.set_title('Evaluation Metrics', fontsize=14, fontweight='bold')
ax1.legend()

# Second plot
## Comparing ROC Curve (model names match the legend of the first plot)
ax2.plot(logistic_regression_result['fpr'], logistic_regression_result['tpr'],
         label='Logistic Regression auc = {:0.3f}'.format(logistic_regression_result['auc']))
ax2.plot(dt_result['fpr'], dt_result['tpr'],
         label='Decision Tree auc = {:0.3f}'.format(dt_result['auc']))
ax2.plot(knn_result['fpr'], knn_result['tpr'],
         label='K-Nearest Neighbors auc = {:0.3f}'.format(knn_result['auc']))

## Configure x and y axis
ax2.set_xlabel('False Positive Rate', fontweight='bold')
ax2.set_ylabel('True Positive Rate', fontweight='bold')

## Create legend & title
ax2.set_title('ROC Curve', fontsize=14, fontweight='bold')
ax2.legend(loc=4)  # loc=4 is the lower-right corner
plt.show()
When I first started the project I looked for interesting data. I searched the Kaggle website and found this dataset: https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
This data looked interesting to me because it has a lot of comments and likes, and when I looked into its columns I found a lot of indicators of heart disease: Age, BMI, Stroke, Sex, and even some indicators I had not thought about: DiffWalking, kidney disease, Race, etc.
First of all, I thought about my business needs and understood that what I'm trying to address is how adults can improve their health status.
For the ML part, my y target is "heart disease": I will predict the result (0/1) based on my features, and once I know the weight of each feature I will understand how adults can improve their chances of not getting heart disease.
After I looked at the data, I started to clean it and remove any null values; after that, I created basic visualisations to understand the patterns in the data.
When I finished this part I created a heatmap and a horizontal bar chart of the correlation of the data with the y target (the heart disease column) and figured out that DiffWalking, Stroke, Diabetic, physical health and kidney disease are the top 5 features related to heart disease.
In the next part, I created dummy variables for the ML training part and then balanced the data: there was a ratio of about 1:10 in favour of people without heart disease, so I applied random under-sampling to reduce the number of people without heart disease and bring the ratio to 1:1.
I used 3 types of ML models: Logistic Regression, Decision Tree and K-Nearest Neighbors.
I then created a plot to see the differences between their results. For each model I computed the
accuracy_score, precision_score, recall_score, f1_score, predict_proba, roc_curve and roc_auc_score.
When I look at the final plot, which compares all the scores, it seems to me that Logistic Regression is the best fit for this dataset: it won on every score except for the precision score.
For my business question: I will tell people who want to decrease their chances of getting heart disease to:
# for expor